We start by loading the previous “prep” file and check if the data is the same.
df <- read.csv("FPdata_PREP.csv", stringsAsFactors = TRUE)
str(df)
## 'data.frame': 426043 obs. of 40 variables:
## $ State : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Gender : int 0 0 0 0 0 1 0 0 0 0 ...
## $ GeneralHealth : int 4 5 4 5 2 1 4 3 3 3 ...
## $ PhysicalHealthDays : int 0 0 2 0 2 1 0 0 0 1 ...
## $ MentalHealthDays : int 0 0 3 0 0 0 0 0 0 0 ...
## $ LastCheckupTime : int 5 1 5 5 5 5 5 5 5 5 ...
## $ PhysicalActivities : int 0 0 1 1 1 0 1 0 1 1 ...
## $ SleepHours : int 8 6 5 7 9 7 7 8 6 7 ...
## $ RemovedTeeth : int 1 1 1 1 1 1 1 1 1 1 ...
## $ HadHeartAttack : int 0 0 0 0 0 1 0 0 0 0 ...
## $ HadAngina : int 0 0 0 0 0 0 0 0 0 0 ...
## $ HadStroke : int 0 0 0 0 0 1 0 0 0 0 ...
## $ HadAsthma : int 0 0 0 1 0 0 0 0 0 0 ...
## $ HadSkinCancer : int 0 1 1 0 0 0 0 0 1 0 ...
## $ HadCOPD : int 0 0 0 0 0 0 0 0 0 0 ...
## $ HadDepressiveDisorder : int 0 0 0 0 0 0 0 0 0 0 ...
## $ HadKidneyDisease : int 0 0 0 0 0 0 0 0 0 1 ...
## $ HadArthritis : int 0 0 0 1 0 0 0 1 1 0 ...
## $ HadDiabetes : int 1 0 0 0 0 1 0 0 0 1 ...
## $ DeafOrHardOfHearing : int 0 0 0 0 0 0 0 0 0 0 ...
## $ BlindOrVisionDifficulty : int 0 0 0 0 0 0 0 0 1 0 ...
## $ DifficultyConcentrating : int 0 0 0 0 0 0 0 0 0 0 ...
## $ DifficultyWalking : int 0 0 0 0 0 0 0 0 1 0 ...
## $ DifficultyDressingBathing: int 0 0 0 0 0 0 0 0 0 0 ...
## $ DifficultyErrands : int 0 0 0 0 0 0 0 0 0 0 ...
## $ SmokerStatus : int 0 0 0 1 0 0 1 0 1 0 ...
## $ ECigaretteUsage : int 0 1 1 1 1 1 1 1 0 1 ...
## $ ChestScan : int 0 0 0 1 1 0 0 1 0 0 ...
## $ RaceEthnicityCategory : Factor w/ 6 levels "0 - NA's","Black only, Non-Hispanic",..: 6 6 6 6 6 6 2 6 6 6 ...
## $ AgeCategory : int 13 13 8 10 5 13 13 13 12 11 ...
## $ HeightInMeters : num 1.68 1.6 1.57 1.65 1.57 1.8 1.65 1.63 1.7 1.68 ...
## $ WeightInKilograms : num 90.7 68 63.5 63.5 54 ...
## $ BMI : num 26.6 26.6 25.6 23.3 21.8 ...
## $ AlcoholDrinkers : int 0 0 0 0 1 0 1 0 0 1 ...
## $ HIVTesting : int 0 0 0 0 0 0 0 0 1 0 ...
## $ FluVaxLast12 : int 1 0 0 1 0 0 0 1 0 1 ...
## $ PneumoVaxEver : int 0 0 0 1 1 1 0 1 0 1 ...
## $ TetanusLast10Tdap : int 1 1 0 1 1 1 1 1 1 1 ...
## $ HighRiskLastYear : int 0 0 0 0 0 0 0 0 0 0 ...
## $ CovidPos : int 0 0 1 0 0 0 0 0 0 0 ...
summary(df)
## State Gender GeneralHealth PhysicalHealthDays
## Min. : 1.0 Min. :0.0000 Min. :1.000 Min. : 0.000
## 1st Qu.:16.0 1st Qu.:0.0000 1st Qu.:3.000 1st Qu.: 0.000
## Median :27.0 Median :0.0000 Median :4.000 Median : 0.000
## Mean :28.4 Mean :0.4745 Mean :3.466 Mean : 4.076
## 3rd Qu.:43.0 3rd Qu.:1.0000 3rd Qu.:4.000 3rd Qu.: 3.000
## Max. :54.0 Max. :1.0000 Max. :5.000 Max. :30.000
## MentalHealthDays LastCheckupTime PhysicalActivities SleepHours
## Min. : 0.000 Min. :1.000 Min. :0.0000 Min. : 1.000
## 1st Qu.: 0.000 1st Qu.:5.000 1st Qu.:1.0000 1st Qu.: 6.000
## Median : 0.000 Median :5.000 Median :1.0000 Median : 7.000
## Mean : 4.158 Mean :4.548 Mean :0.7697 Mean : 7.031
## 3rd Qu.: 4.000 3rd Qu.:5.000 3rd Qu.:1.0000 3rd Qu.: 8.000
## Max. :30.000 Max. :5.000 Max. :1.0000 Max. :24.000
## RemovedTeeth HadHeartAttack HadAngina HadStroke
## Min. :0.0000 Min. :0.00000 Min. :0.00000 Min. :0.00000
## 1st Qu.:0.0000 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.00000
## Median :0.0000 Median :0.00000 Median :0.00000 Median :0.00000
## Mean :0.4712 Mean :0.05596 Mean :0.05893 Mean :0.04286
## 3rd Qu.:1.0000 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:0.00000
## Max. :1.0000 Max. :1.00000 Max. :1.00000 Max. :1.00000
## HadAsthma HadSkinCancer HadCOPD HadDepressiveDisorder
## Min. :0.000 Min. :0.00000 Min. :0.00000 Min. :0.0000
## 1st Qu.:0.000 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.0000
## Median :0.000 Median :0.00000 Median :0.00000 Median :0.0000
## Mean :0.144 Mean :0.08114 Mean :0.07712 Mean :0.1981
## 3rd Qu.:0.000 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:0.0000
## Max. :1.000 Max. :1.00000 Max. :1.00000 Max. :1.0000
## HadKidneyDisease HadArthritis HadDiabetes DeafOrHardOfHearing
## Min. :0.00000 Min. :0.000 Min. :0.0000 Min. :0.00000
## 1st Qu.:0.00000 1st Qu.:0.000 1st Qu.:0.0000 1st Qu.:0.00000
## Median :0.00000 Median :0.000 Median :0.0000 Median :0.00000
## Mean :0.04431 Mean :0.333 Mean :0.1389 Mean :0.08754
## 3rd Qu.:0.00000 3rd Qu.:1.000 3rd Qu.:0.0000 3rd Qu.:0.00000
## Max. :1.00000 Max. :1.000 Max. :1.0000 Max. :1.00000
## BlindOrVisionDifficulty DifficultyConcentrating DifficultyWalking
## Min. :0.00000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.00000 1st Qu.:0.0000 1st Qu.:0.0000
## Median :0.00000 Median :0.0000 Median :0.0000
## Mean :0.05174 Mean :0.1085 Mean :0.1427
## 3rd Qu.:0.00000 3rd Qu.:0.0000 3rd Qu.:0.0000
## Max. :1.00000 Max. :1.0000 Max. :1.0000
## DifficultyDressingBathing DifficultyErrands SmokerStatus ECigaretteUsage
## Min. :0.00000 Min. :0.00000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.0000 1st Qu.:1.0000
## Median :0.00000 Median :0.00000 Median :0.0000 Median :1.0000
## Mean :0.03426 Mean :0.06833 Mean :0.3662 Mean :0.7501
## 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:1.0000 3rd Qu.:1.0000
## Max. :1.00000 Max. :1.00000 Max. :1.0000 Max. :1.0000
## ChestScan RaceEthnicityCategory AgeCategory
## Min. :0.0000 0 - NA's : 13565 Min. : 1.000
## 1st Qu.:0.0000 Black only, Non-Hispanic : 32934 1st Qu.: 5.000
## Median :0.0000 Hispanic : 40998 Median : 8.000
## Mean :0.3684 Multiracial, Non-Hispanic : 9024 Mean : 7.705
## 3rd Qu.:1.0000 Other race only, Non-Hispanic: 22004 3rd Qu.:11.000
## Max. :1.0000 White only, Non-Hispanic :307518 Max. :13.000
## HeightInMeters WeightInKilograms BMI AlcoholDrinkers
## Min. :0.910 Min. : 22.68 Min. :14.51 Min. :0.0000
## 1st Qu.:1.630 1st Qu.: 68.04 1st Qu.:24.37 1st Qu.:0.0000
## Median :1.680 Median : 81.65 Median :26.63 Median :1.0000
## Mean :1.702 Mean : 81.74 Mean :27.53 Mean :0.5843
## 3rd Qu.:1.780 3rd Qu.: 90.72 3rd Qu.:30.34 3rd Qu.:1.0000
## Max. :2.410 Max. :285.00 Max. :40.91 Max. :1.0000
## HIVTesting FluVaxLast12 PneumoVaxEver TetanusLast10Tdap
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:1.0000
## Median :0.0000 Median :1.0000 Median :0.0000 Median :1.0000
## Mean :0.2866 Mean :0.5779 Mean :0.3413 Mean :0.8125
## 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:1.0000
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.0000
## HighRiskLastYear CovidPos
## Min. :0.00000 Min. :0.0000
## 1st Qu.:0.00000 1st Qu.:0.0000
## Median :0.00000 Median :0.0000
## Mean :0.03821 Mean :0.2757
## 3rd Qu.:0.00000 3rd Qu.:1.0000
## Max. :1.00000 Max. :1.0000
levels(df$RaceEthnicityCategory)
## [1] "0 - NA's" "Black only, Non-Hispanic"
## [3] "Hispanic" "Multiracial, Non-Hispanic"
## [5] "Other race only, Non-Hispanic" "White only, Non-Hispanic"
First of all we have to deal with the NA’s in RaceEthnicityCategory as we kept it to avoid multicollinearity on regression models.
print("Total row count:")
## [1] "Total row count:"
prev_rows <- nrow(df)
print(prev_rows)
## [1] 426043
print("RaceEthnicityCategory level count:")
## [1] "RaceEthnicityCategory level count:"
table(df$RaceEthnicityCategory)
##
## 0 - NA's Black only, Non-Hispanic
## 13565 32934
## Hispanic Multiracial, Non-Hispanic
## 40998 9024
## Other race only, Non-Hispanic White only, Non-Hispanic
## 22004 307518
print("RaceEthnicityCategory level 0 - NA's %:")
## [1] "RaceEthnicityCategory level 0 - NA's %:"
prev_nas <- table(df$RaceEthnicityCategory)["0 - NA's"]
prev_nas/ prev_rows * 100
## 0 - NA's
## 3.183951
Usually we can replace them with the mode but as that level only represents 3% of the data and we will still be above 400.000 rows we will simply drop them.
df <- droplevels(df[df$RaceEthnicityCategory != "0 - NA's", ])
table(df$RaceEthnicityCategory)
##
## Black only, Non-Hispanic Hispanic
## 32934 40998
## Multiracial, Non-Hispanic Other race only, Non-Hispanic
## 9024 22004
## White only, Non-Hispanic
## 307518
print(paste("Predicted rows:",prev_rows-prev_nas))
## [1] "Predicted rows: 412478"
print(paste("Actual rows: ",nrow(df)))
## [1] "Actual rows: 412478"
str(df)
## 'data.frame': 412478 obs. of 40 variables:
## $ State : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Gender : int 0 0 0 0 0 1 0 0 0 0 ...
## $ GeneralHealth : int 4 5 4 5 2 1 4 3 3 3 ...
## $ PhysicalHealthDays : int 0 0 2 0 2 1 0 0 0 1 ...
## $ MentalHealthDays : int 0 0 3 0 0 0 0 0 0 0 ...
## $ LastCheckupTime : int 5 1 5 5 5 5 5 5 5 5 ...
## $ PhysicalActivities : int 0 0 1 1 1 0 1 0 1 1 ...
## $ SleepHours : int 8 6 5 7 9 7 7 8 6 7 ...
## $ RemovedTeeth : int 1 1 1 1 1 1 1 1 1 1 ...
## $ HadHeartAttack : int 0 0 0 0 0 1 0 0 0 0 ...
## $ HadAngina : int 0 0 0 0 0 0 0 0 0 0 ...
## $ HadStroke : int 0 0 0 0 0 1 0 0 0 0 ...
## $ HadAsthma : int 0 0 0 1 0 0 0 0 0 0 ...
## $ HadSkinCancer : int 0 1 1 0 0 0 0 0 1 0 ...
## $ HadCOPD : int 0 0 0 0 0 0 0 0 0 0 ...
## $ HadDepressiveDisorder : int 0 0 0 0 0 0 0 0 0 0 ...
## $ HadKidneyDisease : int 0 0 0 0 0 0 0 0 0 1 ...
## $ HadArthritis : int 0 0 0 1 0 0 0 1 1 0 ...
## $ HadDiabetes : int 1 0 0 0 0 1 0 0 0 1 ...
## $ DeafOrHardOfHearing : int 0 0 0 0 0 0 0 0 0 0 ...
## $ BlindOrVisionDifficulty : int 0 0 0 0 0 0 0 0 1 0 ...
## $ DifficultyConcentrating : int 0 0 0 0 0 0 0 0 0 0 ...
## $ DifficultyWalking : int 0 0 0 0 0 0 0 0 1 0 ...
## $ DifficultyDressingBathing: int 0 0 0 0 0 0 0 0 0 0 ...
## $ DifficultyErrands : int 0 0 0 0 0 0 0 0 0 0 ...
## $ SmokerStatus : int 0 0 0 1 0 0 1 0 1 0 ...
## $ ECigaretteUsage : int 0 1 1 1 1 1 1 1 0 1 ...
## $ ChestScan : int 0 0 0 1 1 0 0 1 0 0 ...
## $ RaceEthnicityCategory : Factor w/ 5 levels "Black only, Non-Hispanic",..: 5 5 5 5 5 5 1 5 5 5 ...
## $ AgeCategory : int 13 13 8 10 5 13 13 13 12 11 ...
## $ HeightInMeters : num 1.68 1.6 1.57 1.65 1.57 1.8 1.65 1.63 1.7 1.68 ...
## $ WeightInKilograms : num 90.7 68 63.5 63.5 54 ...
## $ BMI : num 26.6 26.6 25.6 23.3 21.8 ...
## $ AlcoholDrinkers : int 0 0 0 0 1 0 1 0 0 1 ...
## $ HIVTesting : int 0 0 0 0 0 0 0 0 1 0 ...
## $ FluVaxLast12 : int 1 0 0 1 0 0 0 1 0 1 ...
## $ PneumoVaxEver : int 0 0 0 1 1 1 0 1 0 1 ...
## $ TetanusLast10Tdap : int 1 1 0 1 1 1 1 1 1 1 ...
## $ HighRiskLastYear : int 0 0 0 0 0 0 0 0 0 0 ...
## $ CovidPos : int 0 0 1 0 0 0 0 0 0 0 ...
summary(df)
## State Gender GeneralHealth PhysicalHealthDays
## Min. : 1.00 Min. :0.0000 Min. :1.000 Min. : 0.000
## 1st Qu.:16.00 1st Qu.:0.0000 1st Qu.:3.000 1st Qu.: 0.000
## Median :27.00 Median :0.0000 Median :4.000 Median : 0.000
## Mean :28.41 Mean :0.4719 Mean :3.467 Mean : 4.068
## 3rd Qu.:43.00 3rd Qu.:1.0000 3rd Qu.:4.000 3rd Qu.: 3.000
## Max. :54.00 Max. :1.0000 Max. :5.000 Max. :30.000
## MentalHealthDays LastCheckupTime PhysicalActivities SleepHours
## Min. : 0.000 Min. :1.000 Min. :0.0000 Min. : 1.000
## 1st Qu.: 0.000 1st Qu.:5.000 1st Qu.:1.0000 1st Qu.: 6.000
## Median : 0.000 Median :5.000 Median :1.0000 Median : 7.000
## Mean : 4.154 Mean :4.552 Mean :0.7699 Mean : 7.034
## 3rd Qu.: 4.000 3rd Qu.:5.000 3rd Qu.:1.0000 3rd Qu.: 8.000
## Max. :30.000 Max. :5.000 Max. :1.0000 Max. :24.000
## RemovedTeeth HadHeartAttack HadAngina HadStroke
## Min. :0.0000 Min. :0.0000 Min. :0.00000 Min. :0.00000
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.00000 1st Qu.:0.00000
## Median :0.0000 Median :0.0000 Median :0.00000 Median :0.00000
## Mean :0.4707 Mean :0.0558 Mean :0.05887 Mean :0.04267
## 3rd Qu.:1.0000 3rd Qu.:0.0000 3rd Qu.:0.00000 3rd Qu.:0.00000
## Max. :1.0000 Max. :1.0000 Max. :1.00000 Max. :1.00000
## HadAsthma HadSkinCancer HadCOPD HadDepressiveDisorder
## Min. :0.0000 Min. :0.00000 Min. :0.00000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.0000
## Median :0.0000 Median :0.00000 Median :0.00000 Median :0.0000
## Mean :0.1443 Mean :0.08153 Mean :0.07701 Mean :0.1992
## 3rd Qu.:0.0000 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:0.0000
## Max. :1.0000 Max. :1.00000 Max. :1.00000 Max. :1.0000
## HadKidneyDisease HadArthritis HadDiabetes DeafOrHardOfHearing
## Min. :0.00000 Min. :0.0000 Min. :0.0000 Min. :0.00000
## 1st Qu.:0.00000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.00000
## Median :0.00000 Median :0.0000 Median :0.0000 Median :0.00000
## Mean :0.04436 Mean :0.3335 Mean :0.1389 Mean :0.08687
## 3rd Qu.:0.00000 3rd Qu.:1.0000 3rd Qu.:0.0000 3rd Qu.:0.00000
## Max. :1.00000 Max. :1.0000 Max. :1.0000 Max. :1.00000
## BlindOrVisionDifficulty DifficultyConcentrating DifficultyWalking
## Min. :0.00000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.00000 1st Qu.:0.0000 1st Qu.:0.0000
## Median :0.00000 Median :0.0000 Median :0.0000
## Mean :0.05136 Mean :0.1084 Mean :0.1425
## 3rd Qu.:0.00000 3rd Qu.:0.0000 3rd Qu.:0.0000
## Max. :1.00000 Max. :1.0000 Max. :1.0000
## DifficultyDressingBathing DifficultyErrands SmokerStatus ECigaretteUsage
## Min. :0.00000 Min. :0.00000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.0000 1st Qu.:1.0000
## Median :0.00000 Median :0.00000 Median :0.0000 Median :1.0000
## Mean :0.03392 Mean :0.06807 Mean :0.3667 Mean :0.7528
## 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:1.0000 3rd Qu.:1.0000
## Max. :1.00000 Max. :1.00000 Max. :1.0000 Max. :1.0000
## ChestScan RaceEthnicityCategory AgeCategory
## Min. :0.000 Black only, Non-Hispanic : 32934 Min. : 1.000
## 1st Qu.:0.000 Hispanic : 40998 1st Qu.: 5.000
## Median :0.000 Multiracial, Non-Hispanic : 9024 Median : 8.000
## Mean :0.369 Other race only, Non-Hispanic: 22004 Mean : 7.689
## 3rd Qu.:1.000 White only, Non-Hispanic :307518 3rd Qu.:11.000
## Max. :1.000 Max. :13.000
## HeightInMeters WeightInKilograms BMI AlcoholDrinkers
## Min. :0.910 Min. : 22.68 Min. :14.51 Min. :0.0000
## 1st Qu.:1.630 1st Qu.: 68.04 1st Qu.:24.37 1st Qu.:0.0000
## Median :1.680 Median : 81.65 Median :26.63 Median :1.0000
## Mean :1.702 Mean : 81.68 Mean :27.54 Mean :0.5841
## 3rd Qu.:1.780 3rd Qu.: 90.72 3rd Qu.:30.41 3rd Qu.:1.0000
## Max. :2.410 Max. :285.00 Max. :40.91 Max. :1.0000
## HIVTesting FluVaxLast12 PneumoVaxEver TetanusLast10Tdap
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:1.0000
## Median :0.0000 Median :1.0000 Median :0.0000 Median :1.0000
## Mean :0.2871 Mean :0.5785 Mean :0.3433 Mean :0.8158
## 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:1.0000
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.0000
## HighRiskLastYear CovidPos
## Min. :0.00000 Min. :0.0000
## 1st Qu.:0.00000 1st Qu.:0.0000
## Median :0.00000 Median :0.0000
## Mean :0.03842 Mean :0.2774
## 3rd Qu.:0.00000 3rd Qu.:1.0000
## Max. :1.00000 Max. :1.0000
Since we will be doing some clustering, it’s better to normalize all the numeric variables.
normalize <- function(x) {
return ((x - min(x)) / (max(x) - min(x)))
}
factor_i <- which(colnames(df) == 'RaceEthnicityCategory')
df_nor <- as.data.frame(lapply(df[-c(factor_i)], normalize))
df_nor <-cbind(df_nor, df[,factor_i])
colnames(df_nor)[ncol(df_nor)] <- 'RaceEthnicityCategory'
summary(df_nor)
## State Gender GeneralHealth PhysicalHealthDays
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.2830 1st Qu.:0.0000 1st Qu.:0.5000 1st Qu.:0.0000
## Median :0.4906 Median :0.0000 Median :0.7500 Median :0.0000
## Mean :0.5172 Mean :0.4719 Mean :0.6168 Mean :0.1356
## 3rd Qu.:0.7925 3rd Qu.:1.0000 3rd Qu.:0.7500 3rd Qu.:0.1000
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.0000
## MentalHealthDays LastCheckupTime PhysicalActivities SleepHours
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:1.0000 1st Qu.:1.0000 1st Qu.:0.2174
## Median :0.0000 Median :1.0000 Median :1.0000 Median :0.2609
## Mean :0.1385 Mean :0.8881 Mean :0.7699 Mean :0.2624
## 3rd Qu.:0.1333 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:0.3043
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.0000
## RemovedTeeth HadHeartAttack HadAngina HadStroke
## Min. :0.0000 Min. :0.0000 Min. :0.00000 Min. :0.00000
## 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.00000 1st Qu.:0.00000
## Median :0.0000 Median :0.0000 Median :0.00000 Median :0.00000
## Mean :0.4707 Mean :0.0558 Mean :0.05887 Mean :0.04267
## 3rd Qu.:1.0000 3rd Qu.:0.0000 3rd Qu.:0.00000 3rd Qu.:0.00000
## Max. :1.0000 Max. :1.0000 Max. :1.00000 Max. :1.00000
## HadAsthma HadSkinCancer HadCOPD HadDepressiveDisorder
## Min. :0.0000 Min. :0.00000 Min. :0.00000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.0000
## Median :0.0000 Median :0.00000 Median :0.00000 Median :0.0000
## Mean :0.1443 Mean :0.08153 Mean :0.07701 Mean :0.1992
## 3rd Qu.:0.0000 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:0.0000
## Max. :1.0000 Max. :1.00000 Max. :1.00000 Max. :1.0000
## HadKidneyDisease HadArthritis HadDiabetes DeafOrHardOfHearing
## Min. :0.00000 Min. :0.0000 Min. :0.0000 Min. :0.00000
## 1st Qu.:0.00000 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.00000
## Median :0.00000 Median :0.0000 Median :0.0000 Median :0.00000
## Mean :0.04436 Mean :0.3335 Mean :0.1389 Mean :0.08687
## 3rd Qu.:0.00000 3rd Qu.:1.0000 3rd Qu.:0.0000 3rd Qu.:0.00000
## Max. :1.00000 Max. :1.0000 Max. :1.0000 Max. :1.00000
## BlindOrVisionDifficulty DifficultyConcentrating DifficultyWalking
## Min. :0.00000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.00000 1st Qu.:0.0000 1st Qu.:0.0000
## Median :0.00000 Median :0.0000 Median :0.0000
## Mean :0.05136 Mean :0.1084 Mean :0.1425
## 3rd Qu.:0.00000 3rd Qu.:0.0000 3rd Qu.:0.0000
## Max. :1.00000 Max. :1.0000 Max. :1.0000
## DifficultyDressingBathing DifficultyErrands SmokerStatus ECigaretteUsage
## Min. :0.00000 Min. :0.00000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.00000 1st Qu.:0.00000 1st Qu.:0.0000 1st Qu.:1.0000
## Median :0.00000 Median :0.00000 Median :0.0000 Median :1.0000
## Mean :0.03392 Mean :0.06807 Mean :0.3667 Mean :0.7528
## 3rd Qu.:0.00000 3rd Qu.:0.00000 3rd Qu.:1.0000 3rd Qu.:1.0000
## Max. :1.00000 Max. :1.00000 Max. :1.0000 Max. :1.0000
## ChestScan AgeCategory HeightInMeters WeightInKilograms
## Min. :0.000 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.000 1st Qu.:0.3333 1st Qu.:0.4800 1st Qu.:0.1729
## Median :0.000 Median :0.5833 Median :0.5133 Median :0.2248
## Mean :0.369 Mean :0.5574 Mean :0.5281 Mean :0.2249
## 3rd Qu.:1.000 3rd Qu.:0.8333 3rd Qu.:0.5800 3rd Qu.:0.2594
## Max. :1.000 Max. :1.0000 Max. :1.0000 Max. :1.0000
## BMI AlcoholDrinkers HIVTesting FluVaxLast12
## Min. :0.0000 Min. :0.0000 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.3735 1st Qu.:0.0000 1st Qu.:0.0000 1st Qu.:0.0000
## Median :0.4591 Median :1.0000 Median :0.0000 Median :1.0000
## Mean :0.4935 Mean :0.5841 Mean :0.2871 Mean :0.5785
## 3rd Qu.:0.6023 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:1.0000
## Max. :1.0000 Max. :1.0000 Max. :1.0000 Max. :1.0000
## PneumoVaxEver TetanusLast10Tdap HighRiskLastYear CovidPos
## Min. :0.0000 Min. :0.0000 Min. :0.00000 Min. :0.0000
## 1st Qu.:0.0000 1st Qu.:1.0000 1st Qu.:0.00000 1st Qu.:0.0000
## Median :0.0000 Median :1.0000 Median :0.00000 Median :0.0000
## Mean :0.3433 Mean :0.8158 Mean :0.03842 Mean :0.2774
## 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.:0.00000 3rd Qu.:1.0000
## Max. :1.0000 Max. :1.0000 Max. :1.00000 Max. :1.0000
## RaceEthnicityCategory
## Black only, Non-Hispanic : 32934
## Hispanic : 40998
## Multiracial, Non-Hispanic : 9024
## Other race only, Non-Hispanic: 22004
## White only, Non-Hispanic :307518
##
As clustering is unsupervised machine learning we need to ignore RaceEthnicityCategory. I will start with hclust and as the full dataset is to big I take small samples. But to prove the algorithm I will test every distance and agglomeration method. We have to make a little “trick”, because the new sample dataset is random we need to take a look at how many ethnicities we have beforehand to assign the correct value to k.
set.seed(123)
idx <- sample(1:dim(df_nor)[1], 50)
df_sample <- df_nor[idx,]
clusters <- length(unique(df_sample$RaceEthnicityCategory))
distances <- c("euclidean", "maximum", "manhattan", "canberra", "binary", "minkowsky")
methods <- c("complete", "average", "single", "ward.D", "ward.D2", "median", "centroid", "mcquitty")
for (i in 1:length(distances)) {
for (j in 1:length(methods)) {
hc <- hclust(dist(df_sample[,1:39], method="euclidean"), method=methods[[j]])
plot(hc, hang = -1, cex = 0.75, labels=df_sample$RaceEthnicityCategory, main =
paste("Clusters:", clusters, "Distance:", distances[[i]],"Method:",methods[[j]]))
rect.hclust(hc, k=clusters, border = 2:(clusters+1))
}
}
We can clearly see that there is no combination that groups correctly by ethnicity. This can be because the random sample we took, so lets try some others.
for (x in 1:10) {
set.seed(x)
idx <- sample(1:dim(df_nor)[1], 50)
df_sample <- df_nor[idx,]
clusters <- length(unique(df_sample$RaceEthnicityCategory))
distances <- c("euclidean", "maximum", "manhattan", "canberra", "binary", "minkowsky")
methods <- c("complete", "average", "single", "ward.D", "ward.D2", "median", "centroid", "mcquitty")
for (i in 1:length(distances)) {
for (j in 1:length(methods)) {
hc <- hclust(dist(df_sample[,1:39], method="euclidean"), method=methods[[j]])
plot(hc, hang = -1, cex = 0.75, labels=df_sample$RaceEthnicityCategory, main =
paste("Seed:", x, "Clusters:", clusters, "Distance:", distances[[i]],"Method:",methods[[j]]))
rect.hclust(hc, k=clusters, border = 2:(clusters+1))
}
}
}
Once again there is no model that groups the data correctly. Before trying another approach I will try to reduce the variables just to the diseases.
set.seed(123)
idx <- sample(1:dim(df_nor)[1], 50)
df_sample <- df_nor[idx,]
clusters <- length(unique(df_sample$RaceEthnicityCategory))
distances <- c("euclidean", "maximum", "manhattan", "canberra", "binary", "minkowsky")
methods <- c("complete", "average", "single", "ward.D", "ward.D2", "median", "centroid", "mcquitty")
for (i in 1:length(distances)) {
for (j in 1:length(methods)) {
hc <- hclust(dist(df_sample[,10:19], method="euclidean"), method=methods[[j]])
plot(hc, hang = -1, cex = 0.75, labels=df_sample$RaceEthnicityCategory, main =
paste("Clusters:", clusters, "Distance:", distances[[i]],"Method:",methods[[j]]))
rect.hclust(hc, k=clusters, border = 2:(clusters+1))
}
}
Now I will try to reduce the columns only taking into account one disease. As we see previously in the logistic regressions “HadSkinCancer” is the best candidate.
for (i in 1:ncol(df_nor)) {
print(paste(i,colnames(df_nor)[i]))
}
## [1] "1 State"
## [1] "2 Gender"
## [1] "3 GeneralHealth"
## [1] "4 PhysicalHealthDays"
## [1] "5 MentalHealthDays"
## [1] "6 LastCheckupTime"
## [1] "7 PhysicalActivities"
## [1] "8 SleepHours"
## [1] "9 RemovedTeeth"
## [1] "10 HadHeartAttack"
## [1] "11 HadAngina"
## [1] "12 HadStroke"
## [1] "13 HadAsthma"
## [1] "14 HadSkinCancer"
## [1] "15 HadCOPD"
## [1] "16 HadDepressiveDisorder"
## [1] "17 HadKidneyDisease"
## [1] "18 HadArthritis"
## [1] "19 HadDiabetes"
## [1] "20 DeafOrHardOfHearing"
## [1] "21 BlindOrVisionDifficulty"
## [1] "22 DifficultyConcentrating"
## [1] "23 DifficultyWalking"
## [1] "24 DifficultyDressingBathing"
## [1] "25 DifficultyErrands"
## [1] "26 SmokerStatus"
## [1] "27 ECigaretteUsage"
## [1] "28 ChestScan"
## [1] "29 AgeCategory"
## [1] "30 HeightInMeters"
## [1] "31 WeightInKilograms"
## [1] "32 BMI"
## [1] "33 AlcoholDrinkers"
## [1] "34 HIVTesting"
## [1] "35 FluVaxLast12"
## [1] "36 PneumoVaxEver"
## [1] "37 TetanusLast10Tdap"
## [1] "38 HighRiskLastYear"
## [1] "39 CovidPos"
## [1] "40 RaceEthnicityCategory"
set.seed(123)
idx <- sample(1:dim(df_nor)[1], 50)
df_sample <- df_nor[idx,]
clusters <- length(unique(df_sample$RaceEthnicityCategory))
distances <- c("euclidean", "maximum", "manhattan", "canberra", "binary", "minkowsky")
methods <- c("complete", "average", "single", "ward.D", "ward.D2", "median", "centroid", "mcquitty")
for (i in 1:length(distances)) {
for (j in 1:length(methods)) {
hc <- hclust(dist(df_sample[,-c(10:13,15:19,40)], method="euclidean"), method=methods[[j]])
plot(hc, hang = -1, cex = 0.75, labels=df_sample$RaceEthnicityCategory, main =
paste("Only \"HadSkinCancer\" Clusters:", clusters, "Distance:", distances[[i]],"Method:",methods[[j]]))
rect.hclust(hc, k=clusters, border = 2:(clusters+1))
}
}
After all of this we can clearly conclude hclust is not capable to separate our data by ethnicities.
As this file is getting a bit too long I will save the normalized dataset and continue with other clustering algorithms in a new file.
write.csv(df_nor, "FPdata_NOR.csv", row.names = FALSE)